package com.duowan.mspider.processor;

import com.duowan.mspider.util.Constant;
import com.duowan.mspider.util.FileUtils;

import java.io.File;
import java.io.IOException;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import static com.duowan.mspider.util.Constant.HTML_DIR;

/**
 * Merges and de-duplicates the HTML site crawler results (the htmlsiteresult directory).
 *
 * @author:wangyan1
 * @create_time: 2018-06-01 17:03
 */

public class HtmlSiteDistinctFile {

	/**
	 * Reads every entry of the {@code HTML_DIR} directory via {@code FileUtils.read},
	 * collects the returned strings into an insertion-ordered set (so duplicates are
	 * dropped and first-seen order is kept), and then passes each distinct string to
	 * {@code FileUtils.write} targeting the {@code Constant.HTML_RESULT} file.
	 *
	 * <p>Returns without doing anything when {@code HTML_DIR} does not exist or its
	 * entries cannot be listed.
	 *
	 * @param args command-line arguments; not used
	 * @throws IOException if reading or writing fails, or if a previously existing
	 *                     result file cannot be deleted
	 */
	public static void main(String[] args) throws IOException {
		File directory = new File(HTML_DIR);
		if (!directory.exists()) {
			return;
		}

		// listFiles() yields null when the path is not a directory or an I/O error
		// occurs; iterating over it unchecked would throw a NullPointerException.
		File[] files = directory.listFiles();
		if (files == null) {
			return;
		}

		// LinkedHashSet drops duplicates while preserving first-seen order.
		Set<String> result = new LinkedHashSet<>();
		for (File source : files) {
			List<String> lines = FileUtils.read(source);
			// NOTE(review): guards against a null return from FileUtils.read — confirm
			// whether that helper can actually return null.
			if (lines != null) {
				result.addAll(lines);
			}
		}

		File output = new File(Constant.HTML_RESULT);
		// Remove any previous result before writing. A failed delete is reported
		// instead of being ignored, so the run never continues on top of old content.
		if (output.exists() && !output.delete()) {
			throw new IOException("Unable to delete existing result file: " + output.getAbsolutePath());
		}

		// NOTE(review): presumably FileUtils.write appends one entry per call — confirm.
		for (String entry : result) {
			FileUtils.write(output, entry);
		}
	}

}
